Set up

data_RQ1a <- read.csv("RQ1a_receptions_by_OH.csv", header = TRUE, sep = ",")
data_RQ1b <- read.csv("RQ1b_recept_perc_OH.csv", header = TRUE, sep = ",")
data_RQ2 <- read.csv("RQ2_recep_attacks_kp.csv", header = TRUE, sep = ",")
data_RQ3 <- read.csv("RQ3_OH_proportion_final2.csv", header = TRUE, sep = ",")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.2     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.0.1     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library("ggplot2")
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library("plyr")
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library("dplyr")
library("aod")
## 
## Attaching package: 'aod'
## The following object is masked from 'package:survival':
## 
##     rats
library("scales")
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library("ggpubr")
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate

RQ1 1: Does the ability to win depend on how often OH receive the ball?

Alternative 1: Standard linear model

# showing that standard linear model is not a good idea for matches won or sets won (patterns for sets or matches won are very similar)
# per set
lmod1 = lm(Sets.Won ~ Percent.Receptions.by.OH, data_RQ1a)
with(data_RQ1a, plot(Percent.Receptions.by.OH, Sets.Won))
abline(lmod1) # fit is a bad idea

summary(lmod1)
## 
## Call:
## lm(formula = Sets.Won ~ Percent.Receptions.by.OH, data = data_RQ1a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32490 -0.12885  0.00158  0.15498  0.31557 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                0.2066     0.2346   0.881    0.391
## Percent.Receptions.by.OH   0.5416     0.4743   1.142    0.270
## 
## Residual standard error: 0.195 on 16 degrees of freedom
## Multiple R-squared:  0.07535,    Adjusted R-squared:  0.01756 
## F-statistic: 1.304 on 1 and 16 DF,  p-value: 0.2703
# per match
lmod2 = lm(Matches.Won ~ Percent.Receptions.by.OH, data_RQ1a)
with(data_RQ1a, plot(Percent.Receptions.by.OH, Matches.Won))
abline(lmod2) # fit is a bad idea

summary(lmod2)
## 
## Call:
## lm(formula = Matches.Won ~ Percent.Receptions.by.OH, data = data_RQ1a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53909 -0.20533  0.00735  0.22768  0.55388 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                0.1460     0.4029   0.362    0.722
## Percent.Receptions.by.OH   0.6514     0.8148   0.799    0.436
## 
## Residual standard error: 0.3349 on 16 degrees of freedom
## Multiple R-squared:  0.03841,    Adjusted R-squared:  -0.02169 
## F-statistic: 0.6391 on 1 and 16 DF,  p-value: 0.4357
## diagnostics - checking for violations of constant variance, normality, and linearity and possible outliers. 
plot(lmod1)

termplot(lmod1, partial=TRUE,terms = 1)

## finding the correlation between the % of receptions by OH and the % of sets or matches won.
# per set
cor(data_RQ1a$Percent.Receptions.by.OH,data_RQ1a$Sets.Won)
## [1] 0.2745047
# per match
cor(data_RQ1a$Percent.Receptions.by.OH,data_RQ1a$Matches.Won)
## [1] 0.1959897

ANSWER: Although the normality and linearity assumptions seem reasonable, and there are no influential outliers, there seems to be non-constant variance in the data (see trend in plot of fitted values vs square root of standardized residuals). Therefore, a standard linear regression may not be the best fit for the data. Nonetheless, a standard linear model shows that a one-percent increase in reception attempts (by OH per team) is associated with less than one percent increase in sets and matches won, respectively. However, none of these results are statistically significant.

Alternative 2: Generalized linear model: Logistic regression

mean(data_RQ1b$recep_percent_OH)
## [1] 56.36913
#per set
glmod0 = glm(won_set ~ 1, data = data_RQ1b, family = "binomial")
glmod = glm(won_set ~ recep_percent_OH, data = data_RQ1b, family = "binomial")
summary(glmod)
## 
## Call:
## glm(formula = won_set ~ recep_percent_OH, family = "binomial", 
##     data = data_RQ1b)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.238  -1.230   1.120   1.126   1.136  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)      0.0980828  0.1783255   0.550    0.582
## recep_percent_OH 0.0004393  0.0029633   0.148    0.882
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1419.9  on 1026  degrees of freedom
## Residual deviance: 1419.8  on 1025  degrees of freedom
## AIC: 1423.8
## 
## Number of Fisher Scoring iterations: 3
anova(glmod0, glmod, test = "Chisq") # likelihood ratio
## Analysis of Deviance Table
## 
## Model 1: won_set ~ 1
## Model 2: won_set ~ recep_percent_OH
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1026     1419.9                     
## 2      1025     1419.8  1 0.021974   0.8822
# per match
glmod02 = glm(won_match ~ 1, data = data_RQ1b, family = "binomial")
glmod2 = glm(won_match ~ recep_percent_OH, data = data_RQ1b, family = "binomial")
summary(glmod2)
## 
## Call:
## glm(formula = won_match ~ recep_percent_OH, family = "binomial", 
##     data = data_RQ1b)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.271  -1.239   1.096   1.116   1.154  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)      0.055083   0.178374   0.309    0.757
## recep_percent_OH 0.001619   0.002966   0.546    0.585
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1418.2  on 1026  degrees of freedom
## Residual deviance: 1417.9  on 1025  degrees of freedom
## AIC: 1421.9
## 
## Number of Fisher Scoring iterations: 3
anova(glmod02, glmod2, test = "Chisq") # likelihood ratio
## Analysis of Deviance Table
## 
## Model 1: won_match ~ 1
## Model 2: won_match ~ recep_percent_OH
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1026     1418.2                     
## 2      1025     1417.9  1  0.29811   0.5851

Calculating the odds of losing a set or match at two consecutive value of receptions .

newdata = data.frame(recept_percent_OH= 24) # choose a value of receptions
logodds1 = (coef(glmod2)[1] + coef(glmod2)[2]*newdata) # log of winning at first value of receptions
odds1 = exp(logodds1) # calculate the odds

newdata2 = newdata = data.frame(recep_percent_OH = 25) # choose a value of receptions
logodds2 = (coef(glmod2)[1] + coef(glmod2)[2]*newdata2) # log of winning at second value of receptions
odds2 = exp(logodds2) # calculate the odds

# check interpretation
odds2/odds1
##   recep_percent_OH
## 1         1.001621
exp(coef(glmod2)[2])
## recep_percent_OH 
##         1.001621

ANSWER: A logistic regression model does not perform better than an-intercept model (that considers the mean only).There is no statistical evidence that a higher percentage of reception attempts is associated with a higher likelihood of sets or matches won. The odds of winning at a percent of receptions of 56 is the same as the odds of winning at an average level of reception attempts of 55%.

RQ2: Does the physical load imposed on OH make it more difficult for them to do their jobs effectively?

Standard linear model

# 
data_RQ2$recep_attack_count_mean <- data_RQ2$recep_attack_count - mean(data_RQ2$recep_attack_count)
lmod2 = lm(kill_percent ~ recep_attack_count_mean, data_RQ2)
with(data_RQ2, plot(recep_attack_count_mean, kill_percent))
abline(lmod2)

summary(lmod2)
## 
## Call:
## lm(formula = kill_percent ~ recep_attack_count_mean, data = data_RQ2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9314  -3.0083  -0.1308   2.4841  23.6778 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             22.82105    0.18007 126.734   <2e-16 ***
## recep_attack_count_mean -0.01004    0.01183  -0.849    0.396    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.299 on 568 degrees of freedom
## Multiple R-squared:  0.001268,   Adjusted R-squared:  -0.0004905 
## F-statistic: 0.721 on 1 and 568 DF,  p-value: 0.3962
# Computing Pearson's product-moment correlation
cor.test(data_RQ2$recep_attack_count, data_RQ2$kill_percent, method = c("pearson", "kendall", "spearman"))
## 
##  Pearson's product-moment correlation
## 
## data:  data_RQ2$recep_attack_count and data_RQ2$kill_percent
## t = -0.84914, df = 568, p-value = 0.3962
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.11738845  0.04665545
## sample estimates:
##         cor 
## -0.03560635

ANSWER: A one-percent increase in receptions and attack attempts is associated with a reduction of .02 units of kill percentage. However, these results could easily occur by chance alone. The kill percentage at an average value of reception and attack attempts (of 22.8%) is 22.82%.

RQ3: What kinds of groupings will we see in the data based on RECEPTIONS?

Testing normality

shapiro.test(data_RQ3$recep_prop)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_RQ3$recep_prop
## W = 0.99001, p-value = 0.3375

Boxplot

ggplot(data = data_RQ3) + 
  geom_boxplot(mapping = aes(x = OH, y = recep_prop)) 

# changing this to a scatter plot 
options(repr.plot.width=8, repr.plot.height=6)
p_scatter = ggplot(data = data_RQ3) + 
  geom_point(mapping = aes(x = OH, y = recep_prop)) + 
  theme_bw()
p_scatter

##This shows that the majority of receptions (proportion) lands between 0.1 and 0.3. A few outliers over 0.45

Density curves

## first looking at all players (OH) holistically
theme_set(theme_bw(base_size=16))
dx <- density(data_RQ3$recep_prop)
data_RQ3%>%
  ggplot(aes(x=recep_prop)) +
  geom_density(alpha = 0.3)+ 
  geom_vline(xintercept=5, size=1, color="red") +
  xlim(c(min(dx$x), # X-axis limits
         c(max(dx$x)))) +
  labs(x= "OH proportion for every OH",
       subtitle="Proportion of receptions by OH relative to their teams' receptions",
       caption="Data Source: Data Volley")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.

## Warning: Please use `linewidth` instead.
## Warning: Removed 1 rows containing missing values (`geom_vline()`).

## data seem to follow a normal distribution.

##then looking just at teams
theme_set(theme_bw(base_size=10))
dx <- density(data_RQ3$recep_prop)
data_RQ3%>%
  ggplot(aes(x=recep_prop, fill = team)) +
  geom_density(alpha = 0.3)+ 
  geom_vline(xintercept=5, size=1, color="red") +
  xlim(c(min(dx$x), # X-axis limits
         c(max(dx$x)))) +
  labs(x= "OH proportion for every team",
       subtitle="Proportion of receptions by OH for each team'",
       caption="Data Source: Data Volley")
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 1 rows containing missing values (`geom_vline()`).

## different distribution patterns for every team

Violin plots

p = ggplot(data_RQ3, aes(x = category, y = recep_prop))
p = p + geom_violin(trim = FALSE, bw = 0.5)
p = p + geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill = c("#CFB87C", "#565A5C"), alpha = 0.25)
## Warning in geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill =
## c("#CFB87C", : Ignoring unknown parameters: `with`
p = p + geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = 0.6, alpha = 0.2)
p = p + stat_summary(fun.data = mean_sdl, color = c("#CFB87C", "#565A5C"), alpha = 0.5)
p = p +xlab("Team category") + ylab("Proportion or receptions by OH")
p = p + theme_bw()
p
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Histogram with density plot for all teams

# Histogram with density plot
ggplot(data_RQ3, aes(x=recep_prop, fill=category)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666") +
 geom_vline(aes(xintercept=mean(recep_prop)),
            color="blue", linetype="dashed", size=1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.

## Warning: Please use `after_stat(density)` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## data follows a normal distribution with mean around 0.20 

Histogram with density plots for CU and other teams

ggplot(data_RQ3, aes(x=recep_prop, fill=category, color=category)) +
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
   geom_density(alpha=.2, fill="#FF6666") +
 geom_vline(aes(xintercept=mean(recep_prop)),
            color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Vertical boxplots - CU and other teams

min.mean.sd.max <- function(x) {
  r <- c(min(x), mean(x) - 0.5*sd(x), mean(x), mean(x) + 0.5*sd(x), max(x))
  names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
  r
}

p1 <- ggplot(aes(y = recep_prop, x = factor(category)), data = data_RQ3)
p1 <- p1 + stat_summary(fun.data = min.mean.sd.max, geom = "boxplot") + geom_jitter(position=position_jitter(width=.2), size=3) + ggtitle("Boxplot with mean + 0.5 SD, 95%CI, max and min values.") + xlab("Team category") + ylab("Proportion of receptions by OH")
p1

m <- mean(data_RQ3$recep_prop)
sd <- sd(data_RQ3$recep_prop)

low <- m -sd
high <- m + sd
low
## [1] 0.1170786
high
## [1] 0.3125625
## low threshold = 0.12
## high threshold = 0.31

Vertical boxplot - All teams

min.mean.sd.max <- function(x) {
  r <- c(min(x), mean(x) - sd(x), mean(x), mean(x) + sd(x), max(x))
  names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
  r
}

p1 <- ggplot(aes(y = recep_prop, x = factor(team)), data = data_RQ3)
p1 <- p1 + stat_summary(fun.data = min.mean.sd.max, geom = "boxplot") + geom_jitter(position=position_jitter(width=.2), size=3) + ggtitle("Boxplot with mean + 1 SD, 95%CI, max and min values.") + xlab("Team") + ylab("Proportion of receptions and attacks by OH")
p1
## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

Horizontal boxplots

p = ggplot(data_RQ3, aes(x=category, y=recep_prop));
p = p + geom_violin(trim = FALSE)
p = p+ geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill = c("#CFB87C", "#565A5C"),alpha = 0.25)
## Warning in geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill =
## c("#CFB87C", : Ignoring unknown parameters: `with`
p = p + theme_minimal()
p = p + coord_flip()
p = p +xlab("Team category") + ylab("Proportion of receptions by OH")
p = p + theme_bw()
p

RQ3: What kinds of groupings will we see in the data based on ATTACKS?

Boxplot

ggplot(data = data_RQ3) + 
  geom_boxplot(mapping = aes(x = OH, y = attacks_prop)) 

# changing this to a scatter plot 
options(repr.plot.width=8, repr.plot.height=6)
p_scatter = ggplot(data = data_RQ3) + 
  geom_point(mapping = aes(x = OH, y = attacks_prop)) + 
  theme_bw()
p_scatter

##This shows that the majority of receptions (proportion) lands between 0.2 and 0.6. A few outliers below 0.1

Density curves

## first looking at all players (OH) holistically
theme_set(theme_bw(base_size=16))
dx <- density(data_RQ3$attacks_prop)
data_RQ3%>%
  ggplot(aes(x=attacks_prop)) +
  geom_density(alpha = 0.3)+ 
  geom_vline(xintercept=5, size=1, color="red") +
  xlim(c(min(dx$x), # X-axis limits
         c(max(dx$x)))) +
  labs(x= "OH proportion for every OH",
       subtitle="Proportion of attacks by OH relative to their teams' attacks",
       caption="Data Source: Data Volley")
## Warning: Removed 1 rows containing missing values (`geom_vline()`).

## data seem to follow a normal distribution.

##then looking just at teams
theme_set(theme_bw(base_size=8))
dx <- density(data_RQ3$attacks_prop)
data_RQ3%>%
  ggplot(aes(x=attacks_prop, fill = team)) +
  geom_density(alpha = 0.3)+ 
  geom_vline(xintercept=5, size=1, color="red") +
  xlim(c(min(dx$x), # X-axis limits
         c(max(dx$x)))) +
  labs(x= "OH proportion for every team",
       subtitle="Proportion of attacks by OH for each team'",
       caption="Data Source: Data Volley")
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 1 rows containing missing values (`geom_vline()`).

## different distribution patterns for every team

Violin plots

p = ggplot(data_RQ3, aes(x = category, y = attacks_prop))
p = p + geom_violin(trim = FALSE, bw = 0.5)
p = p + geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill = c("#CFB87C", "#565A5C"), alpha = 0.25)
## Warning in geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill =
## c("#CFB87C", : Ignoring unknown parameters: `with`
p = p + geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = 0.6, alpha = 0.2)
p = p + stat_summary(fun.data = mean_sdl, color = c("#CFB87C", "#565A5C"), alpha = 0.5)
p = p +xlab("Team category") + ylab("Proportion or attacks by OH")
p = p + theme_bw()
p
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Histogram with density plot for all teams

# Histogram with density plot
ggplot(data_RQ3, aes(x=attacks_prop, fill=category)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666") +
 geom_vline(aes(xintercept=mean(attacks_prop)),
            color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## data follows a normal distribution with mean around 0.4 

Histogram with density plots for CU and other teams

ggplot(data_RQ3, aes(x=attacks_prop, fill=category, color=category)) +
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
   geom_density(alpha=.2, fill="#FF6666") +
 geom_vline(aes(xintercept=mean(attacks_prop)),
            color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Vertical boxplots - CU and other teams

min.mean.sd.max <- function(x) {
  r <- c(min(x), mean(x) - 0.5*sd(x), mean(x), mean(x) + 0.5*sd(x), max(x))
  names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
  r
}

p1 <- ggplot(aes(y = attacks_prop, x = factor(category)), data = data_RQ3)
p1 <- p1 + stat_summary(fun.data = min.mean.sd.max, geom = "boxplot") + geom_jitter(position=position_jitter(width=.2), size=3) + ggtitle("Boxplot with mean +- 0.5 SD, 95%CI, max and min values.") + xlab("Team category") + ylab("Proportion of attacks by OH")
p1

ma <- mean(data_RQ3$attacks_prop)
sda <- sd(data_RQ3$attacks_prop)

lowa <- ma -sda
higha <- ma + sda
lowa
## [1] 0.2754938
higha
## [1] 0.5308011
## low threshold = 0.28
## high threshold = 0.53

Vertical boxplot - All teams

min.mean.sd.max <- function(x) {
  r <- c(min(x), mean(x) - sd(x), mean(x), mean(x) + sd(x), max(x))
  names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
  r
}

p1 <- ggplot(aes(y = attacks_prop, x = factor(team)), data = data_RQ3)
p1 <- p1 + stat_summary(fun.data = min.mean.sd.max, geom = "boxplot") + geom_jitter(position=position_jitter(width=.2), size=3) + ggtitle("Boxplot with mean + 1 SD, 95%CI, max and min values.") + xlab("Team") + ylab("Proportion of attacks by OH")
p1
## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

## Warning: Removed 2 rows containing missing values (`geom_segment()`).

Horizontal boxplots

p = ggplot(data_RQ3, aes(x=category, y=attacks_prop));
p = p + geom_violin(trim = FALSE)
p = p+ geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill = c("#CFB87C", "#565A5C"),alpha = 0.25)
## Warning in geom_boxplot(with = 0.2, col = c("#CFB87C", "#565A5C"), fill =
## c("#CFB87C", : Ignoring unknown parameters: `with`
p = p + theme_minimal()
p = p + coord_flip()
p = p +xlab("Team category") + ylab("Proportion of attacks by OH")
p = p + theme_bw()
p